I first check the quality of all samples by looking at the number of reads per cell, the number of features (genes) per cell, and the percent mitochondrial genes per cell. In cases where the cells were dying, we generally see a higher percent mitochondria. I like to see that most cells have less than 15% of reads mapping to mitochondrial reads.
Here it seems that most cells look pretty good. You have ~4,000-5,000 genes per cell which is amazing. I normally see closer to 3,000 genes per cell. There are definitely some cells with a really high mitochondiral percent. I filter these cells out before continuing. In general, there were fewer AT02 cells that were filtered out because of quality concerns.
My filters
if(sample_info$type == "single"){
unprocessed_object <- readRDS(sample_info$seurat_unprocessed_path)
} else {
unprocessed_object <- sample_info$seurat_object
}
Idents(unprocessed_object) <- "orig.ident"
rna_qual <- VlnPlot(unprocessed_object,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3)
rna_qualmedian_counts <- median(unprocessed_object$nCount_RNA)
median_features <- median(unprocessed_object$nFeature_RNA)
cell_number <- nrow(unprocessed_object[[]])adt_qual <- adt_qual <- VlnPlot(unprocessed_object,
features = c("nCount_ADT", "nFeature_ADT"))
adt_qualmedian_counts_adt <- median(unprocessed_object$nCount_ADT)
median_features_adt <- median(unprocessed_object$nFeature_ADT)I next identify likely doublets using DoubletFinder which attempts to model what doublets would look like based on a mixing of the different clusters. The doublets identified are shown below. I filter these out before continuing the analysis.
doublet_object <- readRDS(sample_info$seurat_doublet_path)
plotDimRed(doublet_object,
col_by = "Doublet_finder",
plot_type = "rna.umap")[[1]]final_object <- sample_info$seurat_object
median_counts <- median(final_object$nCount_RNA)
median_features <- median(final_object$nFeature_RNA)
cell_number <- nrow(final_object[[]])I next do an initial dimensional reduction with using PCA on the top 2000 variable genes. The PCA is not too informative, but it’s worth looking at a few metrics.
seurat_object <- sample_info$seurat_object
plot1 <- plotDimRed(seurat_object, col_by = "orig.ident",
plot_type = "pca", color = sample_colors)[[1]]
plot2 <- plotDimRed(seurat_object, col_by = c("nCount_RNA",
"nFeature_RNA",
"percent.mt"),
plot_type = "pca")
plot_grid(plot1, plot2[[1]], plot2[[2]], plot2[[3]],
labels = c("A", "B", "C", "D"),
align = "hv", axis = "lr")I next follow the PCA with a UMAP dimensional reduction. Rather than use genes for UMAP, we use the top PCs (~30 for all of these samples). I’ll plot the same metrics for UMAP as for the PCA.
seurat_object <- sample_info$seurat_object
cluster_colors <- sample_info$cluster_colors
plot1 <- plotDimRed(seurat_object, col_by = "orig.ident",
plot_type = "rna.umap", color = sample_colors)[[1]]
plot2 <- plotDimRed(seurat_object, col_by = c("nCount_RNA",
"nFeature_RNA",
"percent.mt"),
plot_type = "rna.umap")
plot3 <- plotDimRed(seurat_object, col_by = "RNA_cluster",
color = cluster_colors, plot_type = "rna.umap")[[1]]
plot_grid(plot1, plot2[[1]], plot2[[2]], plot2[[3]], plot3,
labels = c("A", "B", "C", "D", "E"),
nrow = 3, ncol = 2,
align = "hv", axis = "lr")I named celltypes based using clustifyr which uses a reference dataset to name clusters. First, average expression is found for each cluster in the reference dataset, then a correlation is run between the reference cluster and our clusters. I used The human single cell atlas as the reference.
seurat_object <- sample_info$seurat_object
if(sample_info$type == "single"){
plot1 <- plotDimRed(seurat_object, col_by = "RNA_cluster",
plot_type = "rna.umap")[[1]]
plot2 <- plotDimRed(seurat_object,
col_by = "RNA_celltype", plot_type = "rna.umap",
color = celltype_colors)[[1]]
barplot1 <- stacked_barplots(seurat_object = seurat_object,
meta_col = "RNA_celltype",
percent = TRUE,
color = celltype_colors)
plot_grid(plot1, plot2, NULL, NULL, barplot1,
labels = c("A", "B", "", "", "C"),
align = "hv", axis = "lr",
nrow = 2, ncol = 2)
} else {
plot1 <- plotDimRed(seurat_object, col_by = "uncorrected_cluster",
plot_type = "rna.umap")[[1]]
plot2 <- plotDimRed(seurat_object,
col_by = c("RNA_celltype",
"RNA_combined_celltype"),
plot_type = "rna.umap",
color = celltype_colors)
barplot1 <- stacked_barplots(seurat_object = seurat_object,
meta_col = "RNA_combined_celltype",
percent = TRUE,
color = celltype_colors,
split_by = "sample")
plot_grid(plot1, plot2[[1]], plot2[[2]], NULL, barplot1,
labels = c("A", "B", "C", "", "D"),
align = "hv", axis = "lr",
nrow = 3, ncol = 2)
}I next found differentially expressed genes between all of the clusters and cell types to see if this agrees with the cell type naming above. Overall, I’d agree with the cell types. I’m only showing the top 10 markers of each cell type/cluster below.
Because the sequencing depth is so low, I’d take these DE genes with a grain of salt.
sample_info <- sample_list[["healthy_bcells_1"]]
seurat_data <- sample_info$seurat_object
save_dir <- sample_info$save_dir
# Get marker genes that are saved
marker_genes_rna <- read.csv(file.path(save_dir,
"files/DE/rna_markers_RNA_celltype.csv"),
row.names = 1)
# Pick out the top 10 makers of each cluster
top10_rna <- marker_genes_rna %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n = 10, wt = avg_log2FC) %>%
dplyr::arrange(cluster)
# Make a heatmap with these genes
rna_heatmap <- plot_heatmap(seurat_data, top10_rna$gene, "RNA_celltype",
average_expression = FALSE)[[4]]
# Make a heatmap using the average values from each cluster
rna_heatmap_ave <- plot_heatmap(seurat_data, top10_rna$gene, "RNA_celltype",
average_expression = TRUE)[[4]]
plot_grid(rna_heatmap, rna_heatmap_ave,
nrow = 1, ncol = 2,
align = "hv",
axis = "tb")celltype_cluster_colors <- sample_info$celltype_cluster_colors
# Get marker genes that are saved
marker_genes_rna <- read.csv(file.path(save_dir,
"files/DE/RNA_markers_celltype_cluster.csv"),
row.names = 1)
# Pick out the top 10 makers of each cluster
top10_rna <- marker_genes_rna %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n = 10, wt = avg_log2FC) %>%
dplyr::arrange(cluster)
# Make a heatmap with these genes
rna_heatmap <- plot_heatmap(seurat_data, top10_rna$gene, "celltype_cluster",
average_expression = FALSE,
colors = celltype_cluster_colors)[[4]]
# Make a heatmap using the average values from each cluster
rna_heatmap_ave <- plot_heatmap(seurat_data, top10_rna$gene, "celltype_cluster",
average_expression = TRUE,
colors = celltype_cluster_colors)[[4]]
plot_grid(rna_heatmap, rna_heatmap_ave,
nrow = 1, ncol = 2,
align = "hv",
axis = "tb")merged_seurat <- sample_info$seurat_object
cluster_colors <- sample_info$cluster_colors
sep_1 <- violin_col_by_1 <- "RNA_cluster"
colors_1 <- cluster_colors
umap_cols_1 <- colors_1
if(sample_info$type == "combined"){
sep_2 <- violin_col_by_2 <- "RNA_combined_celltype"
} else {
sep_2 <- violin_col_by_2 <- "RNA_celltype"
}
colors_2 <- umap_cols_2 <- celltype_colors
plot_type = "rna.umap"
extra_pound_gene <- paste0(extra_pound_gene_all, "#")
gene_chunks <- genes %>%
map(~knit_expand(file = gene_template, sample = sample, group = group,
gene = .x, subset = FALSE))Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
merged_seurat <- sample_info$seurat_object
cluster_colors <- sample_info$cluster_colors
sep_1 <- violin_col_by_1 <- "RNA_cluster"
colors_1 <- cluster_colors
umap_cols_1 <- colors_1
if(sample_info$type == "combined"){
sep_2 <- violin_col_by_2 <- "RNA_combined_celltype"
} else {
sep_2 <- violin_col_by_2 <- "RNA_celltype"
}
colors_2 <- umap_cols_2 <- celltype_colors
plot_type = "rna.umap"
extra_pound_gene <- paste0(extra_pound_gene_all, "#")
gene_chunks <- genes %>%
map(~knit_expand(file = gene_template, sample = sample, group = group,
gene = .x, subset = FALSE))Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
Plots of gene expression
Plot of gene expression on single cell UMAP
Plot of RNA_cluster in all cells
Violin plot of RNA_cluster in all cells
Plot of RNA_celltype in all cells
Violin plot of RNA_celltype in all cells
I next looked at a density plot of each protein to see if there was any patterns that could be used to find cutoffs for identifying your subset. While there is certainly structure in the IgM and IgG that I can see, I don’t see as clear of cutoffs for CD21 and CXCR5.
seurat_object <- sample_info$seurat_object
if (group == "rna"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "RNA") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else if (group == "adt"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "CLR_ADT") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else {
stop("Group must be adt or rna")
}
plotting_df <- cbind(plotting_df, seurat_object[["RNA_cluster"]])
plotting_df_long <- plotting_df %>%
tidyr::pivot_longer(cols = all_of(plotting_list),
names_to = "protein",
values_to = "expression")
plot1 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(expression, color = protein)) +
ggplot2::geom_density() +
ggplot2::scale_color_manual(values = protein_colors)
plot2 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(x = expression,
color = protein,
y = protein,
fill = protein)) +
ggridges::geom_density_ridges() +
# ggridges::theme_ridges()
ggplot2::scale_color_manual(values = protein_colors) +
ggplot2::scale_fill_manual(values = protein_colors)
plot_grid(plot1, plot2,
labels = c("A", "B"),
nrow = 1, ncol = 2,
align = "hv", axis = "lr")I have repeated the density plots without CD27 so it is easier to visualize patterns.
seurat_object <- sample_info$seurat_object
if (group == "rna"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "RNA") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else if (group == "adt"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "CLR_ADT") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else {
stop("Group must be adt or rna")
}
plotting_df <- cbind(plotting_df, seurat_object[["RNA_cluster"]])
plotting_df_long <- plotting_df %>%
tidyr::pivot_longer(cols = all_of(plotting_list),
names_to = "protein",
values_to = "expression")
plot1 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(expression, color = protein)) +
ggplot2::geom_density() +
ggplot2::scale_color_manual(values = protein_colors)
plot2 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(x = expression,
color = protein,
y = protein,
fill = protein)) +
ggridges::geom_density_ridges() +
# ggridges::theme_ridges()
ggplot2::scale_color_manual(values = protein_colors) +
ggplot2::scale_fill_manual(values = protein_colors)
plot_grid(plot1, plot2,
labels = c("A", "B"),
nrow = 1, ncol = 2,
align = "hv", axis = "lr")I also tried with the RNA, but there wasn’t much expression. This will probably get better with deeper sequencing.
seurat_object <- sample_info$seurat_object
if (group == "rna"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "RNA") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else if (group == "adt"){
plotting_df <- GetAssayData(seurat_object, slot = "data", assay = "CLR_ADT") %>%
as.matrix %>%
t %>%
data.frame %>%
dplyr::select(all_of(plotting_list))
} else {
stop("Group must be adt or rna")
}
plotting_df <- cbind(plotting_df, seurat_object[["RNA_cluster"]])
plotting_df_long <- plotting_df %>%
tidyr::pivot_longer(cols = all_of(plotting_list),
names_to = "protein",
values_to = "expression")
plot1 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(expression, color = protein)) +
ggplot2::geom_density() +
ggplot2::scale_color_manual(values = protein_colors)
plot2 <- ggplot2::ggplot(data = plotting_df_long,
mapping = ggplot2::aes(x = expression,
color = protein,
y = protein,
fill = protein)) +
ggridges::geom_density_ridges() +
# ggridges::theme_ridges()
ggplot2::scale_color_manual(values = protein_colors) +
ggplot2::scale_fill_manual(values = protein_colors)
plot_grid(plot1, plot2,
labels = c("A", "B"),
nrow = 1, ncol = 2,
align = "hv", axis = "lr")ADT_data <- GetAssayData(seurat_data, assay = "CLR_ADT", slot = "data") %>%
t %>%
data.frame
# CD27 - from Zach use top 20%
cd27_cutoff <- quantile(ADT_data$CD27.1, probs = .8)
# IgM - from zach bottom 20%
igm_cutoff <- quantile(ADT_data$IgM, prob = 0.2)
# IgD cutoff - a little less than 1, at the plateau
igd_cutoff <- quantile(ADT_data$IgD, prob = 0.5)
# CD21 cutoff CD21 Bottom 10-15% Maybe the bump around 0.5?
cd21_cutoff <- quantile(ADT_data$CD21, prob = 0.1)
ADT_data_long <- ADT_data %>%
tidyr::pivot_longer(cols = all_of(colnames(.)), names_to = "protein",
values_to = "value")# Plots of cutoffs
IgM_plot <- ADT_data_long %>%
dplyr::filter(protein == "IgM") %>%
ggplot2::ggplot(ggplot2::aes(x = value, fill = protein)) +
ggplot2::geom_density() +
ggplot2::scale_fill_manual(values = protein_colors_adt) +
ggplot2::geom_vline(xintercept = igm_cutoff)
print(IgM_plot)# Plots of cutoffs
IgD_plot <- ADT_data_long %>%
dplyr::filter(protein == "IgD") %>%
ggplot2::ggplot(ggplot2::aes(x = value, fill = protein)) +
ggplot2::geom_density() +
ggplot2::scale_fill_manual(values = protein_colors_adt) +
ggplot2::geom_vline(xintercept = igd_cutoff)
print(IgD_plot)# Plots of cutoffs
CD21_plot <- ADT_data_long %>%
dplyr::filter(protein == "CD21") %>%
ggplot2::ggplot(ggplot2::aes(x = value, fill = protein)) +
ggplot2::geom_density() +
ggplot2::scale_fill_manual(values = protein_colors_adt) +
ggplot2::geom_vline(xintercept = cd21_cutoff)
print(CD21_plot)I next found the cells that matched each cutoff (low IgM, high IgD, low CD21) and counted the number of cells that passed each filter. The scores are 0-3 where 0 means the cells didn’t pass any filter and 3 means the cell passed all filters.
There were only 42 cells that passed all of the filters
table(seurat_data$pass_filters)
0 1 2 3
3099 5576 1058 42
And they didn’t seem to align to any particular cluster (except maybe the T cell cluster?)
plot_1 <- plotDimRed(seurat_data, "pass_filters", plot_type = "rna.umap")[[1]]
plot_2 <- plotDimRed(seurat_data, "pass_filters",
plot_type = "rna.umap", highlight_group = TRUE,
group = 3, meta_data_col = "pass_filters")[[1]]
plot_3 <- plotDimRed(seurat_data, "celltype_cluster",
plot_type = "rna.umap", highlight_group = TRUE,
group = 3, meta_data_col = "pass_filters")[[1]]
plot_grid(plot_1, plot_2, plot_3, align = "hv", axis = "lr",
nrow = 2, ncol = 2, labels = c("A", "B", "C"))Below I’m showing the number of cells that match your cutoffs in each cluster. * celltype_cluster is the name of the cluster with the cell type appended to make it easier to see * n is the number of cells within that cluster that passed all cutoffs * total is the total number of cells in the cluster * fraction is n/total
seurat_data[[]] %>%
dplyr::select(celltype_cluster, pass_filters) %>%
dplyr::group_by(celltype_cluster, .drop = FALSE) %>%
dplyr::count(pass_filters) %>%
dplyr::group_by(celltype_cluster, .drop = FALSE) %>%
dplyr::mutate(total = sum(n)) %>%
dplyr::ungroup() %>%
dplyr::mutate(fraction = n/total) %>%
dplyr::filter(pass_filters == 3) %>%
dplyr::select(celltype_cluster, n, total, fraction)